EDA Analysis
- Load the dataset
listings.csv, which contains information about Airbnb listings in Boston.
# Attach packages (tidyverse first, then plotly, as before).
library(tidyverse)
library(plotly)

# Read the listings table from the working directory.
listings <- read_csv("listings.csv")

# Mapbox access token, needed by the plot_mapbox() calls further down.
# A token that is already present in the environment (e.g. from ~/.Renviron)
# is respected; the literal below is only a fallback.
# NOTE(review): an access token is committed in source here. Consider
# rotating it and supplying it only through the environment.
if (!nzchar(Sys.getenv("MAPBOX_TOKEN"))) {
  Sys.setenv("MAPBOX_TOKEN" = "pk.eyJ1IjoicmljZW1hY2hpbmUiLCJhIjoiY2tuZXJ1Z2x0MDEweDJvcGF4OGZtcDR4ZiJ9.-vyiET9gaDK4jXrzueRdZw")
}

# Working copy of the data, printed for a first look at its columns.
boston_airbnb <- listings
boston_airbnb
## # A tibble: 2,959 x 16
## id name host_id host_name neighbourhood_g~ neighbourhood latitude
## <dbl> <chr> <dbl> <chr> <lgl> <chr> <dbl>
## 1 3781 HARBORSIDE-W~ 4804 Frank NA East Boston 42.4
## 2 6695 $99 Special!~ 8229 Terry NA Roxbury 42.3
## 3 10813 Back Bay Apt~ 38997 Michelle NA Back Bay 42.4
## 4 10986 North End (W~ 38997 Michelle NA North End 42.4
## 5 13247 Back Bay stu~ 51637 Susan NA Back Bay 42.4
## 6 16384 Small Room i~ 23078 Eric NA Beacon Hill 42.4
## 7 18711 The Dorset R~ 71783 Lance NA Dorchester 42.3
## 8 22195 Copley House~ 85130 Copley NA Back Bay 42.3
## 9 22354 COPLEY SQ...~ 85770 Robert NA South End 42.3
## 10 40601 Private room~ 174986 Robert NA Jamaica Plain 42.3
## # ... with 2,949 more rows, and 9 more variables: longitude <dbl>,
## # room_type <chr>, price <dbl>, minimum_nights <dbl>,
## # number_of_reviews <dbl>, last_review <date>, reviews_per_month <dbl>,
## # calculated_host_listings_count <dbl>, availability_365 <dbl>
# Distribution summary of the listing prices.
summary(listings[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 69.0 110.0 148.6 174.0 10000.0
# The 20 lowest prices (any NA would be placed last, as with order()).
head(sort(listings[["price"]], na.last = TRUE), 20)
## [1] 0 0 0 0 19 20 21 21 23 25 25 25 25 25 26 26 26 27 27 28
# The 20 highest prices.
tail(sort(listings[["price"]], na.last = TRUE), 20)
## [1] 850 890 900 948 950 975 999 1000 1000 1000 1000 1000
## [13] 1052 1200 1700 1995 2116 3999 10000 10000
# Box plot of the raw prices, to make the extreme values visible.
plot_ly(data = listings, type = "box", y = ~price)
# So, there are 8 units priced over 1000:
sum(listings[["price"]] > 1000)
## [1] 8
# List those units with the columns needed to judge them.
listings %>%
  select(name, room_type, price) %>%
  filter(price > 1000)
## # A tibble: 8 x 3
## name room_type price
## <chr> <chr> <dbl>
## 1 Large, modern 2br, 2ba renovated 6/16 with patio Entire home/apt 3999
## 2 The Historic House in the North End/Waterfront Entire home/apt 1052
## 3 Quiet getaway in a Boston home Entire home/apt 1200
## 4 Fun Retreat 13BR 23beds near Downtown FreePrk Entire home/apt 2116
## 5 Boston Homestel, Double Bed Private room 10000
## 6 Boston Homestel, 1 Double and 1 Single Bed Private room 10000
## 7 Renovated Studio Apartment in South Boston!!! Entire home/apt 1995
## 8 Bright Bedroom in Beautiful Apartment Private room 1700
# The two 10000 entries look like plain errors: 10000 for a private room with a double bed?!
# The remaining private room (priced at 1700) seems to be just overpriced.
# Let's examine more carefully the "Entire home/apt" category:
# histogram of the base-10 log of price for that room type only.
gg_pricey <-
  ggplot(
    data = listings %>%
      filter(room_type == "Entire home/apt") %>%
      select(price),
    mapping = aes(x = log(price, 10))
  ) +
  geom_histogram()
# Convert the ggplot object to an interactive plotly figure.
ggplotly(gg_pricey)
# And there are 4 units priced at zero:
sum(listings[["price"]] == 0)
## [1] 4
# List them to see what kind of listings they are.
listings %>%
  select(name, room_type, price) %>%
  filter(price == 0)
## # A tibble: 4 x 3
## name room_type price
## <chr> <chr> <dbl>
## 1 The Revolution Hotel Hotel room 0
## 2 Inn @ St. Botolph Hotel room 0
## 3 Boston Fenway Inn Hotel room 0
## 4 citizenM Boston North Station Hotel room 0
# The zero entries are probably just wrong.
# To deal with those I choose to remove all price == 0 and all price > 1000.
# In addition, I add the base-10 log of price as `log_price`, which works
# better with the default colorscale.
# The filter and the log column are produced in a single pass, so the
# cleaned table is built exactly once.
boston_airbnb <-
  listings %>%
  filter(price > 0, price <= 1000) %>%
  mutate(log_price = log10(price))
# Create a plot that demonstrates the effect of neighborhood on price.
# Neighbourhood names ordered by increasing median price; used below as the
# factor levels so the boxes are drawn in that order.
lvls <-
  boston_airbnb %>%
  group_by(neighbourhood) %>%
  summarise(median_price = median(price)) %>%
  with(neighbourhood[order(median_price)])
# Box plot of price per neighbourhood, with neighbourhoods ordered by `lvls`.
# The y axis shows the raw price on a logarithmic scale, so its title says
# exactly that (the tick labels are prices, not log values).
# The outer parentheses print the figure as well as assigning it.
(p0 <-
  plot_ly(
    boston_airbnb,
    x = ~factor(neighbourhood, levels = lvls),
    y = ~price,
    type = "box",
    showlegend = FALSE,
    name = ""
  ) %>%
  layout(
    yaxis = list(type = "log", title = "price (log scale)"),
    xaxis = list(title = "", tickangle = -35)
  )
)
# Organize the price data on a mapbox layer:
# one marker per listing, coloured by the base-10 log of its price, with a
# hover label giving name, room type, price and minimum nights.
p <-
  boston_airbnb %>%
  plot_mapbox() %>%
  add_markers(
    x = ~longitude,
    y = ~latitude,
    color = ~log(price, 10),
    name = "Log (base 10) of price",
    hoverinfo = "text",
    text = ~paste(
      name,
      "\nRoom type:", room_type,
      "\nPrice: ", price,
      "\nMinimum nights: ", minimum_nights
    )
  ) %>%
  layout(
    mapbox = list(
      style = "dark",
      zoom = 9.5,
      center = list(lon = -71.1, lat = 42.32)
    )
  )
# Display the map.
p
# GPX function
# Read the MBTA GPX file and keep only the waypoints and tracks that belong
# to the Red, Green, Blue or Orange line.
library(tmaptools)
mbta <- read_GPX("mbta.gpx")
# Waypoints are matched on their `type` column.
stations <- filter(
  mbta[["waypoints"]],
  grepl(pattern = "Red Line|Green Line|Blue Line|Orange Line", x = type)
)
# Tracks are matched on their `name` column.
T_lines <- filter(
  mbta[["tracks"]],
  grepl(pattern = "Red Line|Green Line|Blue Line|Orange Line", x = name)
)
# First overlay: the stations and the tracks on top of the listings map,
# each hover label showing the feature's `name`. The result is displayed,
# not stored.
p %>%
  add_sf(
    data = stations,
    name = "MBTA T stations",
    hoverinfo = "text",
    text = ~name,
    inherit = FALSE
  ) %>%
  add_sf(
    data = T_lines,
    name = "MBTA T lines",
    hoverinfo = "text",
    text = ~name
  )
# Consider changing colors of lines according to color:

#' Add the tracks of one MBTA line to a map, drawn in that line's colour.
#'
#' @param p A plotly object to which the layer is added.
#' @param line_color A colour name such as "red". It is used twice: as a
#'   case-insensitive pattern matched against the `name` column of `tracks`
#'   (it is passed to grepl(), so it is interpreted as a regular expression),
#'   and as the literal drawing colour of the layer.
#' @param tracks An sf object with a `name` column holding the candidate
#'   tracks. Defaults to the script-level `T_lines`, which keeps existing
#'   two-argument calls working unchanged.
#' @return The value of add_sf(): `p` with a layer named
#'   "<line_color> line" added.
add_MBTA_line <- function(p, line_color, tracks = T_lines) {
  # Keep only the tracks whose name mentions this colour.
  line_tracks <- filter(tracks, grepl(line_color, name, ignore.case = TRUE))

  add_sf(
    p,
    data = line_tracks,
    # I() marks the value as a literal colour rather than data to be mapped.
    color = ~I(line_color),
    text = ~name,
    hoverinfo = "text",
    name = paste0(line_color, " line")
  )
}
# Listings map plus the stations (pink markers, labelled "name (type)"),
# followed by one coloured layer per MBTA line, added in the order
# red, green, orange, blue.
p1 <- Reduce(
  add_MBTA_line,
  c("red", "green", "orange", "blue"),
  init = add_sf(
    p,
    data = stations,
    name = "MBTA T stations",
    text = ~paste0(name, " (", type, ")"),
    hoverinfo = "text",
    color = I("pink"),
    size = I(30)
  )
)
# Display the map with a titled colour bar (p1 itself is left unchanged).
colorbar(p1, title = "log(price)")
# Read the neighbourhood polygons from the KML file in the working directory.
boston_neighborhoods <- sf::st_read(dsn = "Boston_Neighborhoods.kml")
## Reading layer `Boston_Neighborhoods' from data source
## `C:\Users\jacob\OneDrive\Desktop\UMass Stat\STAT697V\HW\HW8\Boston_Neighborhoods.kml'
## using driver `KML'
## Simple feature collection with 26 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -71.19125 ymin: 42.22792 xmax: -70.92278 ymax: 42.39699
## Geodetic CRS: WGS 84
# Add the neighborhood boundaries to the map:
# the polygons are layered on top of p1, each hover label showing the
# polygon's `Name`, and the colour bar is given its title.
p2 <-
  colorbar(
    add_sf(
      p1,
      data = boston_neighborhoods,
      name = "Neighborhoods Boundaries",
      text = ~Name,
      hoverinfo = "text",
      fill = "",
      inherit = FALSE
    ),
    title = "log(price)"
  )
- Final Version of Boston Airbnb Visualization
# Stack the neighbourhood box plot (top, 20% of the height) above the map
# (bottom, 80%) in a single figure.
subplot(
  p0,
  p2,
  nrows = 2,
  heights = c(0.2, 0.8),
  margin = 0.1
)